In [1]:
%load_ext autoreload
%autoreload 2
%matplotlib inline
import warnings
warnings.filterwarnings('ignore')
import pandas as pd
from db_utils import query_analytics_store,query_hive_ssh
import dateutil

In [2]:
response_table = 'log.QuickSurveysResponses_15266417'
impression_table = 'log.QuickSurveyInitiation_15278946'
host = 'analytics-store.eqiad.wmnet'

EL Summary Stats


In [3]:
query = """
SELECT
    COUNT(*) as n
FROM log.QuickSurveyInitiation_15278946
WHERE
    event_eventName ='eligible'
"""

n_eligible = query_analytics_store(query, {}).iloc[0]['n']

query = """
SELECT
    COUNT(*) as n
FROM log.QuickSurveyInitiation_15278946
WHERE
    event_eventName ='impression'
"""

n_impressions = query_analytics_store(query, {}).iloc[0]['n']

query = """
SELECT
    COUNT(*) as n
FROM log.QuickSurveysResponses_15266417
"""
n_responses = query_analytics_store(query, {}).iloc[0]['n']

query = """
SELECT
    COUNT(*) as n
FROM log.QuickSurveysResponses_15266417
WHERE
    event_surveyResponseValue ='ext-quicksurveys-external-survey-yes-button'
"""
n_yes = query_analytics_store( query, {}).iloc[0]['n']

query = """
SELECT
    COUNT(*) as n
FROM log.QuickSurveysResponses_15266417
WHERE
    event_surveyResponseValue ='ext-quicksurveys-external-survey-no-button'
"""
n_no = query_analytics_store( query, {}).iloc[0]['n']



print('Eligible Pageviews: ', n_eligible)
print('Widget Impressions: ', n_impressions)
print('Clicks: ', n_responses)
print('Yeses: ', n_yes)
print('Nos: ', n_no)


Eligible Pageviews:  23298498
Widget Impressions:  18109333
Clicks:  210863
Yeses:  55575
Nos:  155288

Does each eligible pageview have a unique survey token?


In [25]:
query = """
SELECT COUNT(DISTINCT(event_surveyInstanceToken)) as n
FROM log.QuickSurveyInitiation_15278946
WHERE event_eventName ='eligible'
"""
print(query_analytics_store(query, {}).iloc[0]['n'])


86481

Pretty Close

Does each widget impression have a unique survey token?


In [26]:
query = """
SELECT COUNT(DISTINCT(event_surveyInstanceToken)) as n
FROM log.QuickSurveyInitiation_15278946
WHERE event_eventName ='impression'
"""
print(query_analytics_store(query, {}).iloc[0]['n'])


68065

Pretty Close.

Does each Widget Impression have a matching Pageview?


In [27]:
query = """
SELECT
    COUNT(DISTINCT(i.event_surveyInstanceToken)) as n
FROM
    (SELECT event_surveyInstanceToken
    FROM log.QuickSurveyInitiation_15278946
    WHERE
    event_eventName ='impression') i
JOIN
    (SELECT event_surveyInstanceToken
    FROM log.QuickSurveyInitiation_15278946
    WHERE
    event_eventName ='eligible') e
ON
    (i.event_surveyInstanceToken = e.event_surveyInstanceToken)
"""

df = query_analytics_store(query, {})
print(df.iloc[0]['n'])


68013

Close. It seems there are a small number of impressions without a matching pageview.

Does each click have a matching Widget Impression?


In [28]:
query = """
SELECT *
FROM
    log.QuickSurveysResponses_15266417 r,
    log.QuickSurveyInitiation_15278946 i
WHERE
    r.event_surveyInstanceToken = i.event_surveyInstanceToken
    AND i.event_eventName ='impression'
""" 

d_click = query_analytics_store( query, {})
print(d_click.shape[0], len(set(d_click['event_surveyInstanceToken'])))


838 838

Each click has exactly one widget impression. One click without a matching rendered widget

Does each Google Survey Response have a registered click


In [43]:
d_survey = pd.read_csv('responses.tsv', sep = '\t')
st = 'This is you survey ID. Please do not modify.'
ct = 'event_surveyInstanceToken'
df_survey_click = d_survey.merge(d_click, how = 'inner', right_on = ct, left_on = st)

In [44]:
print(df_survey_click.shape[0])


139

In [45]:
print(d_survey.shape[0])


179

Does each Google Survey Response have a registered click when Survey was not shown to DNT users


In [46]:
d_survey['Timestamp'] = d_survey['Timestamp'].apply(lambda x: dateutil.parser.parse(x))

In [54]:
df_survey_click['Timestamp'] = df_survey_click['Timestamp'].apply(lambda x: dateutil.parser.parse(x))

In [55]:
d_survey[d_survey['Timestamp'] > '2016-02-25 16:20'][d_survey['Timestamp'] < '2016-02-25 17:10'].shape #12:20 1:08


Out[55]:
(36, 5)

In [57]:
df_survey_click[df_survey_click['Timestamp'] > '2016-02-25 16:20'][df_survey_click['Timestamp'] < '2016-02-25 17:10'].shape #12:20 1:08


Out[57]:
(33, 38)

Does each Google Survey Response have a registered impression?


In [18]:
query = """
SELECT
    event_surveyInstanceToken
FROM log.QuickSurveyInitiation_15278946
WHERE
    event_eventName ='impression'
"""

d_impression = query_analytics_store(query, {})
df_survey_impression = d_survey.merge(d_impression, how = 'inner', right_on = 'event_surveyInstanceToken', left_on = st)

In [19]:
print(df_survey_impression.shape[0])


61

Only 61 out of 81 survey responses have a matching click.

Does each Google Survey Response have a registered pageview?


In [20]:
query = """
SELECT
    event_surveyInstanceToken
FROM log.QuickSurveyInitiation_15278946
WHERE
    event_eventName ='eligible'
"""

d_pageview = query_analytics_store(query, {})
df_survey_pageview = d_survey.merge(d_pageview, how = 'inner', right_on = 'event_surveyInstanceToken', left_on = st)

In [21]:
print(df_survey_pageview.shape[0])


61

Summary


In [21]:
print('Eligible Pageviews: ', n_eligible)
print('Widget Impressions: ', n_impressions)
print('Clicks: ', n_responses)
print('Yeses: ', n_yes)
print('Google Responses tracked in EL', df_survey_click.shape[0])
print('Google Responses', d_survey.shape[0])


Eligible Pageviews:  53373
Widget Impressions:  41957
Clicks:  463
Yeses:  134
Google Responses tracked in EL 59
Google Responses 81
  • Don't know how many users where in the sample
  • One in 300 pageviews results in a 'Yes'
  • 62% of 'Yes' events lead to a survery response
  • only 60% of survey responses are trackable in EL

Digging Into Missing Data


In [22]:
df_survey_click2 = d_survey.merge(d_click, how = 'left', right_on = ct, left_on = st)

In [23]:
df_survey_click2 = df_survey_click2[[st,'Timestamp', ct, 'timestamp']].sort('Timestamp')
df_survey_click2.columns = ['Google Token', 'Google Timestamp', 'EL Token', 'EL Timestamp']

In [24]:
df_survey_click2['Google Token'].apply(len).value_counts()


Out[24]:
16    81
Name: Google Token, dtype: int64

No obvious pattern...

Are surveySessionToken tokens unique per IP


In [25]:
query = """
SELECT
    COUNT(DISTINCT(clientIP)) as n
FROM log.QuickSurveyInitiation_15278946
"""
nIPs = query_analytics_store( query, {}).iloc[0]['n']
print(nIPs)


17573

In [26]:
query = """
SELECT
    COUNT(DISTINCT(CONCAT(clientIP, userAgent))) as n
FROM log.QuickSurveyInitiation_15278946
"""
nIPUAs = query_analytics_store( query, {}).iloc[0]['n']
print(nIPUAs)


18017

In [27]:
query = """
SELECT
    COUNT(DISTINCT(event_surveySessionToken)) as n
FROM log.QuickSurveyInitiation_15278946
"""
sessiontokens = query_analytics_store( query, {}).iloc[0]['n']
print(sessiontokens)


21338

Check User agent Distribution


In [109]:
query = """
SELECT COUNT(*) as n, userAgent
FROM log.QuickSurveyInitiation_15278946
WHERE event_eventName ='eligible'
GROUP BY userAgent
"""
el_ua_df = query_analytics_store( query, {})
el_ua_df['proportion'] = el_ua_df['n'] / el_ua_df['n'].sum()
el_ua_df['userAgent'] = el_ua_df['userAgent'].apply(lambda x: x[1:-1])

In [110]:
query = """
SELECT SUM(n) as n FROM ellery.ua_counts
"""
dn = query_hive_ssh( query, 'temp.txt')

query = """
SELECT n, user_agent 
FROM ellery.ua_counts
ORDER BY n DESC
LIMIT 1000
"""
webrequest_ua_df = query_hive_ssh( query, 'temp.txt')
webrequest_ua_df['proportion'] = webrequest_ua_df['n'] / dn.iloc[0]['n']

In [111]:
ua_merge = webrequest_ua_df.merge(el_ua_df, how ='left', left_on = 'user_agent', right_on = 'userAgent' )
ua_merge.fillna(0, inplace = True)
ua_merge['delta'] = ua_merge['proportion_x'] -  ua_merge['proportion_y']
ua_merge.sort('delta', inplace = True, ascending = False)

In [116]:
for i, r in ua_merge[['user_agent', 'delta']].head(25).iterrows():
    print(r['user_agent'], r['delta'])


NativeHost 0.0244573181121145
Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/48.0.2564.116 Safari/537.36 0.01253092194677231
Mozilla/5.0 (Windows NT 6.1; Trident/7.0; rv:11.0) like Gecko 0.009188252840339143
Mozilla/5.0 (X11; Linux x86_64; rv:10.0) Gecko/20150101 Firefox/20.0 (Chrome) 0.004675413187936616
Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/48.0.2564.116 Safari/537.36 0.004011174647886456
Mozilla/4.5 (compatible; HTTrack 3.0x; Windows 98) 0.003908672686999489
Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/48.0.2564.116 Safari/537.36 0.0037288223318179617
Mozilla/5.0 (Windows NT 10.0; WOW64; rv:44.0) Gecko/20100101 Firefox/44.0 0.0034854385468370093
Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1) 0.0031351167149429203
Mozilla/4.0 (compatible; MSIE 4.01; Windows CE; PPC) 0.0029805168254509054
Blackboard Safeassign 0.002484239223036292
WikipediaApp/4.1.7.171 (iPhone OS 9.2.1; Phone) 0.002456977338558528
Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/600.1.25 (KHTML, like Gecko) Version/8.0 Safari/600.1.25 0.002025239383686236
Mozilla/5.0 (Windows NT 6.3; WOW64; rv:44.0) Gecko/20100101 Firefox/44.0 0.0019839397215352227
Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/48.0.2564.116 Safari/537.36 0.0018660161794041286
Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0) 0.0014473865842308479
WikipediaApp/2.1.141-r-2016-02-10 (Android 5.1.1; Phone) Google Play 0.0014244334639631965
Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2228.0 Safari/537.36 0.0012574544215368888
Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/48.0.2564.116 Safari/537.36 0.0012418936309361393
Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/47.0.2526.80 Safari/537.36 0.0011633597673001421
Mozilla/5.0 (Windows NT 6.3; WOW64; Trident/7.0; rv:11.0) like Gecko 0.001154960516022807
Mozilla/4.0 (compatible; Google Desktop) 0.00114499914806611
Mozilla/5.0 (X11; Linux x86_64; rv:10.0) Gecko/20100101 Firefox/10.0 (Chrome) 0.0010802521724314193
Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; rv:11.0) like Gecko 0.0009943697753306059
Mozilla/5.0 (iPhone; CPU iPhone OS 7_1_2 like Mac OS X) AppleWebKit/537.51.2 (KHTML, like Gecko) Version/7.0 Mobile/11D257 Safari/9537.53 0.000960850508479967

In [7]:
query = """
    SELECT
        webHost,
        page_title as title,
        event_surveyInstanceToken as survey_token,
        clientIp,
        userAgent,
        CAST(DATE_FORMAT(STR_TO_DATE(timestamp, '%%Y%%m%%d%%H%%i%%S'), '%%Y-%%m-%%d %%H:%%i:%%S') AS CHAR(22) CHARSET utf8) AS timestamp
    FROM
        log.QuickSurveysResponses_15266417 s,
        enwiki.page p
    WHERE
        p.page_id = s.event_pageId 
        AND s.event_surveyResponseValue ='ext-quicksurveys-external-survey-yes-button'
    """

df =  query_analytics_store( query, {})

In [9]:
(df['clientIp'] + df['userAgent']).value_counts().value_counts()


Out[9]:
1    54574
2      359
3       27
4       11
6        2
5        2
dtype: int64

In [ ]: